global preclean = 1 //must turn on if adding new cohorts
global clean_enrollment = 1
global clean_degree = 1

*ipeds and chetty cleaned before
*needs to be run post lotto assembly
*modify olivia's code

***************
// TIME HORIZON FOR OUTCOMES
***************
local weeksPerYear = 52
global n_years_out = 8

forvalues y = 1/$n_years_out {
	
	// the annual outcomes begin in July and end in the following June
	global week_start_inY`y'	"yw(cohort + `=`y'-1', (`weeksPerYear'/2) + 1)"
	global week_end_inY`y'		"yw(cohort + `y', 	(`weeksPerYear'/2))"
	global week_end_inY`y'fall	"yw(cohort + `y'-1, 	`weeksPerYear')"
	global week_start_inY`y'spring	"yw(cohort + `y',  1)"
	global week_end_inY`y'spring	"yw(cohort + `y', (`weeksPerYear'/2))"
	
	// cumulative outcomes include all windows from start of Y1 through the relevant date
	global week_start_byY`y'	${week_start_inY1}
	global week_end_byY`y'		${week_end_inY`y'}
	global week_end_byY`y'fall	${week_end_inY`y'fall}
	global week_end_byY`y'spring	${week_end_inY`y'spring}
	global week_start_byY`y'spring	${week_start_inY1}

}

#delimit ;

// college categories
global collegeCategories  
    any
    public private
    4yr 2yr
    MA MA_public 
	out OOS_public
	4yrpub 4yrMApub 4yrOOSpub 4yrpriv 4yrMApriv 4yrOOSpriv
	2yrpub 2yrMApub 2yrpriv
;
#delimit cr


if $preclean==1{
// Author: Olivia Kim
// Date: July 2016
// Purpose: This file generates enrollment records using the NSC data.

*****************************************************************************************************
*****************************************************************************************************

// only keep relevant sample for my study to reduce the file size
use "$data\lottery_applicants_voting.dta", clear
ren proj_year12 cohort
keep sasid cohort
isid sasid
save "$data/IDS.dta", replace

// load data
use "$data_clean/NSC_enrollment.dta", clear
merge m:1 sasid using "$data/IDS.dta", keep(match ) nogen

save "$data/NSC_enrollment_voter.dta", replace

use "$data_clean/NSC_degrees.dta", clear
merge m:1 sasid using "$data/IDS.dta", keep(match ) nogen

save "$data/NSC_degrees_voter.dta", replace

}
if $clean_enrollment==1{
*****************************************************************************************************
*****************************************************************************************************

*** MATCH NSC ENROLLMENT SPELL DATES TO START AND END WEEKS ***

// load data
use "$data/NSC_enrollment_voter.dta", clear
*a few won't match because the subcampus doesn't match to college data, fix that here

g sub = substr(ID_FSC,1,6) +"-00"
foreach f in 001072-05 001536-01 001540-01 001564-01 002029-01 002029-03 002155-08 002155-10 002589-03 ///
	002629-02  003388-01  009828-12 010198-17 010394-11 010727-11 030106-18 002077-02 030425-01 {
		qui replace ID_FSC= sub if ID_FSC=="`f'"
	}
	drop sub 

// merge with FSC-IPEDS crosswalk
*one school of vet medicine does not match
merge m:1 ID_FSC using "$data_clean/FSC.dta",   keep(master match) nogen

// check IPEDS IDs are properly assigned
*no IPEDS code for a college of vetrinary medicine in St. Kitts.
*assert !missing(ID_IPEDS) if ID_FSC!="022779-00"

// merge with IPEDS college characteristics
*only one that doesn't merge is the vet school
merge m:1 ID_IPEDS using "$data_clean/IPEDS.dta", /*assert(match using) */ keep(match) nogen
*assert (!missing(ipeds_years) & !missing(ipeds_ownership) & !missing(ipeds_calendar))| ID_FSC=="022779-00"

// drop existing variables
gen college_4yr = college_years == 4 if !missing(college_years)
gen college_2yr = college_years == 2 if !missing(college_years)
drop college_ownership college_years

rename ipeds_* *
gen college_MA_private = college_private*college_MA
gen college_OOS_private = college_private*college_out
gen college_4yrpub	= (college_4yr & college_public)
gen college_4yrMApub= (college_4yr & college_MA_public)
gen college_4yrOOSpub= (college_4yr & college_OOS_public)
gen college_4yrpriv	= (college_4yr & college_private)
gen college_4yrMApriv	= (college_4yr & college_private & college_MA)
gen college_4yrOOSpriv	= (college_4yr & college_private & college_out)
gen college_2yrpub	= (college_2yr & college_public)
gen college_2yrMApub= (college_2yr & college_MA_public)
gen college_2yrpriv	= (college_2yr & college_private)
// convert dates to weeks
foreach time in start_date end_date {
	gen week_`time' = wofd(term_`time')
	format week_`time' %tw
}

// expand to generate all weeks during term for each enrollment spell - now one obs for each week
sort sasid college week_start
gen spell_nsc = _n
gen n_weeks = (week_end - week_start) + 1
expand n_weeks
bysort spell_nsc: gen week = week_start + (_n - 1)
format week %tw

// flag NSC-provided terms
gen source_nsc = 1

	// if there are multiple records for a single week, default to the later recod
	// b/c we think it's a revision of the first and is therfore more accurate. 
	bysort sasid college week (term_start_date term_end_date) NSCdatayear: keep if (_n == _N)

// rename variables for merging with outcomes data
keep sasid ID_FSC ID_IPEDS college week source spell enrollment ///
college_state years calendar ownership cohort college_* NSCdatayear 


save "${data}/nsc_enroll_cln.dta", replace

*****************************************************************************************************
*****************************************************************************************************

// generate enrollment windows

use "${data}/nsc_enroll_cln.dta", clear

	// find consecutive subspells in spells from NSC that didn't match school records 
	egen ID_tsset = group(sasid college spell_nsc), missing
	sort ID_tsset week
	gen subspell = 1
	replace subspell = subspell[_n-1]+1 if (ID_tsset == ID_tsset[_n-1]) & (week != week[_n-1] + 1)

	// update start and end weeks of sub spells
	bysort ID_tsset subspell: egen week_start = min(week)
	bysort ID_tsset subspell: egen week_end = max(week)
	gen weeks = (week_end - week_start) + 1
	format week_* %tw

// get one record for each spell
drop week ID_tsset *spell*
bysort _all: keep if (_n == 1) 
isid sasid college week_start week_end 

**************************
// loop over time windows
**************************

forvalues y = 1/$n_years_out {
foreach prefix in "in" "by" {

	// specify time
	local time `prefix'Y`y'             

	// get date that overlap period starts and ends
	qui gen overlap_start 	= max(week_start, ${week_start_`time'})
	qui gen overlap_end 	= min(week_end, ${week_end_`time'}) 
	
	// overlap period is min of the difference and zero
	qui gen overlap = max(overlap_end - overlap_start + 1, 0)
	
	// scale by term length
	qui gen spell_share_`time' = (overlap / weeks) 
	label var spell_share_`time' "share of enrollment spell in `time' window"
	drop overlap*
	
	// specify time
	local time `prefix'Y`y'             

	// get date that overlap period starts and ends
	qui gen overlap_start 	= max(week_start, ${week_start_`time'})
	qui gen overlap_end 	= min(week_end, ${week_end_`time'fall}) 
	
	// overlap period is min of the difference and zero
	qui gen overlap = max(overlap_end - overlap_start + 1, 0)
	
	// scale by term length
	qui gen spell_share_`time'fall = (overlap / weeks) 
	label var spell_share_`time'fall "share of enrollment spell in `time' fall window"
	drop overlap*
	
	// get date that overlap period starts and ends
	qui gen overlap_start 	= max(week_start, ${week_start_`time'spring})
	qui gen overlap_end 	= min(week_end, ${week_end_`time'spring}) 
	
	// overlap period is min of the difference and zero
	qui gen overlap = max(overlap_end - overlap_start + 1, 0)
	
	// scale by term length	
	qui gen spell_share_`time'spring = (overlap / weeks) 
	label var spell_share_`time'spring "share of enrollment spell in `time' spring window"
	drop overlap*

}
}

*****************************************************************************************************
*****************************************************************************************************

** GENERATE ENROLLMENT FLAT FILE ***

gen term_att = 1	
replace term_att= 0 if enrollment<3 // include half time or more (or blank) // enrollment == 0 

// loop over time windows
forvalues y = 1/$n_years_out {
foreach prefix in "in" "by" {

	// build time string
	local timelist `prefix'Y`y' `prefix'Y`y'fall `prefix'Y`y'spring
	
	foreach time of local timelist{
		// loop over college categories
		foreach cat of global collegeCategories {	
		
			// binary attempted enrollment
			qui gen byte att_`cat'_`time' = term_att * college_`cat' * (spell_share_`time' > 0)

			// save college category label
			local l_`cat': var label college_`cat'
		}
	}
}
}

* Get first college attended to compare to LTO
gen startdate = dofw(week_start)
format startdate %td
tostring cohort, gen(cohort_str)
gen hsgraddate = "7/1/"+cohort_str
gen hsgraddatetd = date(hsgraddate,"MDY")
format hsgraddatetd %td
g keepflag=0
replace keepflag=1 if startdate>=hsgraddatetd
egen firstattdate = min(startdate) if keepflag==1, by(sasid cohort)
egen firstattdate_school = min(startdate) if keepflag==1, by(sasid cohort ID_FSC)
format firstattdate firstattdate_school %td
gen keepflag2=0
replace keepflag2=1 if firstattdate_school == startdate & firstattdate_school== firstattdate


*add opeid
g ID_FSC_firstinst= substr(ID_FSC,1,6) if keepflag2==1
g collegename_firstinst= college if keepflag2==1

keep *first* att* college_*  sasid cohort 
duplicates drop
foreach d of varlist att* college_* {
	bys sasid: egen max = max(`d')
	replace `d'=max
	drop max
	}

foreach v of varlist *firstinst{
gsort - sasid  `v'
bys sasid : replace `v' = `v'[_n-1] if _n>1 & `v'[_n-1]!=""
}

drop *date*
duplicates drop

ren collegename college_name_firstinst

*now we have a flat file

************************************************************************************	
************************************************************************************

*** FORCE WITHIN WINDOW BINARY ENROLLMENT OUTCOMES TO BE DISJOINT *** 

// loop over time windows
forvalues t = 1/$n_years_out {
	
	// build time string
	local timelist Y`t' Y`t'fall Y`t'spring
	foreach time of local timelist{
	// for attempted variables
		foreach y in att { 
				
			// 4 and 2-year colleges: If both, code as 4 year so that 2yr is 2yr only.  
			replace `y'_2yr_in`time' = 0 if `y'_4yr_in`time'==1
			replace `y'_2yrMApub_in`time' = 0 if `y'_4yr_in`time'==1
			replace `y'_2yrpub_in`time' = 0 if `y'_4yr_in`time'==1
			replace `y'_2yrpriv_in`time' = 0 if `y'_4yr_in`time'==1
			
			// In-state and out-of-state: If both, code as in-state.
			replace `y'_out_in`time' = 0 if `y'_MA_in`time'==1
			
			// Public and private: If both, code as public.
			replace `y'_private_in`time' = 0 if `y'_public_in`time'==1
			replace `y'_4yrpriv_in`time' = 0 if `y'_4yrpub_in`time'==1
			replace `y'_4yrpriv_in`time' = 0 if `y'_4yrMApub_in`time'==1	
			
			// MA and out of MA public: If both, code as in-state
			replace `y'_OOS_public_in`time' = 0 if `y'_MA_public_in`time'==1
		}
	}
}

// disjoint outcomes
replace college_2yr = 0 if college_4yr==1
replace college_2yrpub = 0 if college_4yr==1
replace college_2yrMApub = 0 if college_4yr==1
replace college_2yrpriv = 0 if college_4yr==1
replace college_out = 0 if college_MA==1
replace college_private = 0 if college_public==1
replace college_2yrpriv = 0 if college_public==1
replace college_private = 0 if (college_4yrpub==1 | college_4yrMApub==1)
replace college_OOS_public = 0 if college_MA_public==1


// save flat file
gen source_nsc = 1
compress

** added by Astrid to eliminate duplicates

duplicates tag sasid cohort, gen(dup)
egen group = group(sasid cohort)	
	bys group: egen college_name_firstinst2 = mode(college_name_firstinst)
	bys group: egen ID_FSC_firstinst2 = mode(ID_FSC_firstinst)
drop group
	
replace college_name_firstinst = college_name_firstinst2 if college_name_firstinst == ""
replace ID_FSC_firstinst = ID_FSC_firstinst2 if ID_FSC_firstinst == ""

drop ID_FSC_firstinst2 college_name_firstinst2

duplicates report 
duplicates drop


save "$data/flat_enrollment_NSC.dta", replace


}
if $clean_degree==1{
// Author: Olivia Kim
// Date: July 25 2016
// Purpose: This code cleans the degrees file

// program parameters
local degrees 			AA BA CERT GRAD
local schooltypes		public private MA out MA_public OOS_public MA_private OOS_private

*****************************************************************************************************
*****************************************************************************************************

*** MATCH NSC ENROLLMENT SPELL DATES TO START AND END WEEKS ***

// load data
use "${data}/NSC_degrees_voter.dta", clear

g sub = substr(ID_FSC,1,6) +"-00"
foreach f in 001072-05 001536-01 001540-01 001564-01 002029-01 002029-03 002155-08 002155-10 002589-03 ///
	002629-02  003388-01  009828-12 010198-17 010394-11 010727-11 030106-18 002077-02 030425-01 001074-82 001434-80 ///
	002050-90 002120-80 002126-02 002145-80 002148-80 002150-80 002199-80 002199-83 002206-80 002208-80 ///
	002208-84 002218-81 002575-81 002580-83 002580-84 002580-86 002580-88 002580-91 002580-92 002580-93 ///
	002589-01 003329-85 003404-10 003410-81 003414-80 004861-80 010198-80 012393-83 021664-22 022606-10 023621-00 {
		qui replace ID_FSC= sub if ID_FSC=="`f'"
	}
	drop sub 
merge m:1 ID_FSC using "$data_clean/FSC.dta",    keep(match) nogen // assert(match using)
replace ID_IPEDS = 134237 if ID_FSC=="023621-00" //Full Sail University
merge m:1 ID_IPEDS using "$data_clean/IPEDS.dta", assert(match using) keep(match) nogen

// verify all characteristics are populated
assert !missing(ipeds_years) & !missing(ipeds_ownership) & !missing(ipeds_calendar)

// drop existing variables
gen college_4yr = college_years == 4 if !missing(college_years)
gen college_2yr = college_years == 2 if !missing(college_years)
drop college_ownership college_years

rename ipeds_* *
gen college_MA_private = college_private*college_MA
gen college_OOS_private = college_private*college_out
gen college_4yrpub	= (college_4yr & college_public)
gen college_4yrMApub= (college_4yr & college_MA_public)
gen college_4yrOOSpub= (college_4yr & college_OOS_public)
gen college_4yrpriv	= (college_4yr & college_private)
gen college_4yrMApriv	= (college_4yr & college_private & college_MA)
gen college_4yrOOSpriv	= (college_4yr & college_private & college_out)
gen college_2yrpub	= (college_2yr & college_public)
gen college_2yrMApub= (college_2yr & college_MA_public)
gen college_2yrpriv	= (college_2yr & college_private)


	* Merge on credential levels from NSC website
	 recast str60 degree_title, force
	 ren degree_title degree_title_name
	compress
		merge m:1 degree_title_name using "${data_clean}\nsc_credential_table.dta", keep(1 3) nogen


//deal with degree_title
gen degree=""
replace degree="BA" if CREDENTIAL_LEVEL_CODE=="BD"
replace degree="AA" if CREDENTIAL_LEVEL_CODE=="AD"
replace degree="GRAD" if CREDENTIAL_LEVEL_CODE=="DP"
replace degree="GRAD" if CREDENTIAL_LEVEL_CODE=="DR"
replace degree="GRAD" if CREDENTIAL_LEVEL_CODE=="MD"
replace degree="GRAD" if CREDENTIAL_LEVEL_CODE=="PC"
replace degree="CERT" if CREDENTIAL_LEVEL_CODE=="UC"

drop CREDENTIAL*

g degree_matched=0
replace degree_matched=1 if degree!=""

replace degree="BA" if regexm(degree_title, "BACHELOR")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "BACHELOR")==1
replace degree="AA" if regexm(degree_title, "ASSOCIATE")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "ASSOCIATE")==1
replace degree="AA" if regexm(degree_title, "ASSOC")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "ASSOC")==1 
replace degree="GRAD" if regexm(degree_title, "MASTER")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "MASTER")==1 
replace degree="CERT" if regexm(degree_title, "CERTIFICATE")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "CERTIFICATE")==1 
replace degree="CERT" if regexm(degree_title, "CERT")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "CERT")==1
replace degree="BA" if regexm(degree_title, "B.A.")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "B.A.")==1
replace degree="BA" if regexm(degree_title, "B.S.")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "B.S.")==1
replace degree="GRAD" if regexm(degree_title, "DOCTOR")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "DOCTOR")==1
replace degree="BA" if regexm(degree_title, "BACH")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "BACH")==1
replace degree="BA" if regexm(degree_title, "BACCA")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "BACCA")==1
replace degree="BA" if regexm(degree_title, "UNDERGRAD")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "UNDERGRAD")==1
replace degree="GRAD" if regexm(degree_title, "M.A.")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "M.A.")==1 
replace degree="GRAD" if regexm(degree_title, "M.S.")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "M.S.")==1 
replace degree="GRAD" if regexm(degree_title, "MS")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "MS")==1 
replace degree="BA" if regexm(degree_title, "BS")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "BS")==1
replace degree="AA" if regexm(degree_title, "AA")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "AA")==1 
replace degree="AA" if regexm(degree_title, "AS")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "AS")==1 
replace degree="BA" if regexm(degree_title, "BA")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "BA")==1
replace degree="BA" if regexm(degree_title, "AB")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "AB")==1
replace degree="BA" if regexm(degree_title, "DOUBLE MAJOR")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "DOUBLE MAJOR")==1
replace degree="GRAD" if regexm(degree_title, "ED.M.")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "ED.M.")==1 
replace degree="CERT" if regexm(degree_title, "DIPLOMA")==1 &degree_matched==0
replace degree_matched=1 if regexm(degree_title, "DIPLOMA")==1
tab degree_title if degree_matched==0, sort
tab degree_title years if degree_matched==0


// assign BA for 4-yr and AA for 2yr for those with no or with confusing degree title information
* this is only for the few that don't have degree info assigned above
replace degree = "BA" if (years == 4) & degree==""
replace degree = "AA" if (years == 2) & degree==""
replace degree = "CERT" if (years == 1) & degree==""
assert !mi(degree)

// generate weekly dataset
gen degree_week = wofd(graduated_date) 
format *week %tw

********************************************************************************
********************************************************************************

*** GENERATE COMPLETIONS OUTCOMES ****

// loop over time windows and degree types
forvalues y = 1/$n_years_out {
	
	local time Y`y'

	// binary completions outcome by degree type
	foreach str of local degrees {
		gen cmp_`str'_by`time' = (degree == "`str'") * (degree_week < ${week_end_by`time'})
	} 
	// binary completion (any)
	gen cmp_any_by`time' = (cmp_BA_by`time' | cmp_AA_by`time' )
	
} // end of time loop

// degree by institution characteristics
forvalues y = 1/$n_years_out {
	local time Y`y'
	foreach schooltype of local schooltypes{
		gen cmp_BA_`schooltype'_by`time' = cmp_BA_by`time'*college_`schooltype'
		gen cmp_AA_`schooltype'_by`time' = cmp_AA_by`time'*college_`schooltype'
	}
}

*SRC 9/6/2019
*duplicates due to NSC file
drop grad_year NSCdatayear searchbegin college_*  collegeseq youruniq  //college name sometimes inconsistent
duplicates drop
*remaining dups seem to be two dates within same institution, or two institutions (multiple degrees)
*within institution, keep earliest degree


*by degree type
bys sasid ID_FSC degree: egen min=min(graduated_date)
keep if graduated_date==min
drop  min
drop *CERT* *GRAD* 
drop if degree=="CERT"|degree=="GRAD"
duplicates drop 

*now only multi institution dups remain
*within degree type, take earliest degree
bys sasid degree:  egen min=min(graduated_date)
keep if graduated_date==min

*some are on same date -- randomly break ties
sample 1, count by(sasid graduated_date degree)
*now all that remains in BA and AA institutions

foreach d in AA BA{
	g ID_FSC_`d'=ID_FSC if degree=="`d'"
	g college_name_`d'=college if degree=="`d'"
	g graduated_date_`d'=graduated_date if degree=="`d'"
}

drop ID_FSC ID_IP ID_OPE college degree_title calendar years ownership degree degree_matched degree_week  graduated_date min
duplicates drop 

foreach v of varlist ID*A college*A{
sort sasid `v'
bys sasid : replace `v' = `v'[_n+1] if _n>=1 & `v'[_n+1]!=""
}
foreach v of varlist graduated_date*{
bys sasid : egen max =max(`v')
replace `v' = max
drop max
}
*these are people with both AA and BA
duplicates drop


// collapse to flat file 
collapse (max) cmp* , by(sasid cohort *BA *AA)

gen source_nsc = 1

*YOU CAN HAVE BOTH A BA AND AN AA
// save data
compress
save "$data/flat_degrees_NSC.dta", replace


}
